package net.matuschek.spider;

/**
 * This class implements a web robot that does a search through
 * the web starting from a given start document up to a given 
 * search depth.
 * 
 * @author Daniel Matuschek / Oliver Schmidt 
 * @version $Revision: 1.35 $
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;

import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpConstants;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;

import org.apache.log4j.Category;
import org.w3c.dom.Element;

public class WebRobot implements Runnable, Cloneable {

    /** host name of the start URL (set by setStartURL) */
    private String pathName;

	/** the name of the robot */
	private final static String ROBOT_NAME = "JoBo";

	/** the default agent name */
	private final static String AGENT_NAME = 
          ROBOT_NAME+"/1.4 (http://www.matuschek.net/jobo.html)";

	/** the robot exception handler */
	protected RobotExceptionHandler exceptionHandler = 
          new DefaultRobotExceptionHandler();

	/** default maximal search depth */
	private final static int DEFAULT_DEPTH = 0;

	/** the URL where the robot walk starts from */
	protected URL startURL = null;

	/** the host and directory where retrieval started from */
	protected String startDir = "";

	/** maximal search depth */
	protected int maxDepth = DEFAULT_DEPTH;

	/** is it allowed to walk to other hosts than the starting host ? */
	protected boolean walkToOtherHosts = false;

	/** DocManager will store or process retrieved documents */
	protected HttpDocManager docManager;

	/** HttpTool will be used to retrieve documents from a web server */
	protected HttpTool httpTool = new HttpTool();

	/** Log4J category for logging */
	protected Category log;

	/** Referer used to retrieve the first document */
	protected String startReferer = "-";

	/** test for robots.txt (Robot Exclusion protocol) */
	protected NoRobots robCheck;

	/** current tasks (URLs still to be processed) */
	protected TaskList todo = null;

	/** a list of all URLs we got already */
	protected TaskList visited = null;
	
	/** ignore settings in /robots.txt ? */
	protected boolean ignoreRobotsTxt = false;

	/** sleep that number of seconds after every retrieved document */
	protected int sleepTime = 0;

	/** fills out HTML forms using the configured form handlers */
	protected FormFiller formFiller = new FormFiller();

	/** these URLs (as Strings) may be visited more than once */
	protected Vector visitMany = new Vector();

	/** for callback to the user interface **/
	protected WebRobotCallback webRobotCallback = null;

	/** should we stop robot operation ? **/
	protected boolean stopIt = false;

	/** to check if it is allowed to travel to a given URL **/
	protected URLCheck urlCheck = null;

	/** should the robot suspend the current walk() ? **/
	protected boolean sleep;

	/** list of allowed URL prefixes (checked even if walkToOtherHosts is false) **/
	protected Vector allowedURLs = new Vector();

	/** allow travelling the whole host ? */
	protected boolean allowWholeHost = true;

	/** 
	 * maximum document age in seconds, negative value means
	 * no limit 
	 */
	protected long maxDocumentAge = -1; // no limit

	/** 
	 * allow travelling to all subdomains of the start host ? 
	 * @see #setAllowWholeDomain(boolean)
	 */
	protected boolean allowWholeDomain = true;

	/** 
	 * do more flexible tests if the new URL is on the same host
	 * @see #basicURLCheck(URL)
	 */
	protected boolean flexibleHostCheck = false;

	/**
	 * FilterChain to filter the document before storing it
	 */
	protected FilterChain filters = null;

	/**
	 * don't retrieve pages again that are already stored in the DocManager
	 */
	protected boolean allowCaching = true;
	
	/**
	 * Check for documents with the same content (duplicate detection)
	 */
	protected boolean duplicateCheck = false;
	
	/**
	 * Initializes the robot with the default implementation 
	 * of the TaskList interface.
	 * 
	 * @param expectedDocumentCount the expected number of documents,
	 *        used to pre-size the duplicate-content map and task lists
	 */
	public WebRobot(int expectedDocumentCount) {
		log = Category.getInstance(getClass().getName());
		content2UrlMap = new HashMap(expectedDocumentCount);
		// NOTE(review): the boolean flag is defined by HashedMemoryTaskList
		// (not visible here) — presumably ordered access for the to-do list
		registerVisitedList(new HashedMemoryTaskList(false,
					expectedDocumentCount));
		registerToDoList(new HashedMemoryTaskList(true,
					expectedDocumentCount));
		this.expectedDocumentCount = expectedDocumentCount;
		this.setAgentName(AGENT_NAME);
	}

	/**
	 * Initializes the robot with the default implementation of the TaskList
	 * interface and the default expected document count.
	 */
	public WebRobot() {
		this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
	}
	
	/**
	 * Sets the implementation class for the backend task list storage.
	 * WebRobot uses the TaskList interface to store future tasks.
	 *
	 * If you want to use your own TaskList implementation, just call
	 * this method.
	 * 
	 * @param todo TaskList to be used for the "to do" list
	 */
	public void registerToDoList(TaskList todo) {
		this.todo = todo;
	}

	/**
	 * Tells whether a stop of the robot run has been requested.
	 * @return true after stopRobot() has been called
	 * @see #stopRobot()
	 */
        public boolean getstopIt(){
            return stopIt;
        }

	/**
	 * Sets the implementation class for the backend task list storage.
	 * WebRobot uses the TaskList interface to store URLs that have
	 * been retrieved before.
	 *
	 * If you want to use your own TaskList implementation, just call
	 * this method.
	 * 
	 * @param visited TaskList to be used for the list of visited URLs
	 */
	public void registerVisitedList(TaskList visited) {
		this.visited = visited;
	}

	/**
	 * @return the start URL for this robot
	 */
	public URL getStartURL() {
		return startURL;
	}

	/**
	 * Sets the start URL for this robot and derives the start
	 * host/directory prefix that is used for the "stay below the
	 * start URL" travel checks.
	 * @param startURL the start URL
	 */
	public void setStartURL(URL startURL) {
		this.startURL = startURL;

		pathName = startURL.getHost();

		// startDir is host + the directory part of the path, i.e.
		// everything up to and including the last '/'; a URL whose
		// path contains no separator maps to the host root "/"
		String path = startURL.getPath();
		int lastSlash = path.lastIndexOf('/');
		if (lastSlash < 0) {
			this.startDir = startURL.getHost() + "/";
		} else {
			this.startDir = startURL.getHost() + path.substring(0, lastSlash + 1);
		}
	}

	/**
	 * @return the maximal allowed search depth
	 */
	public int getMaxDepth() {
		return maxDepth;
	}

	/**
	 * sets the maximal search depth
	 * @param maxDepth maximal number of link levels to follow from the start URL
	 */
	public void setMaxDepth(int maxDepth) {
		this.maxDepth = maxDepth;
	}

	/**
	 * Get the value of bandwidth of the used HttpTool
	 * @return value of bandwidth.
	 */
	public int getBandwidth() {
		return httpTool.getBandwidth();
	}

	/**
	 * Set the value of bandwidth of the used HttpTool
	 * @param bandwidth Value to assign to bandwidth.
	 */
	public void setBandwidth(int bandwidth) {
		httpTool.setBandwidth(bandwidth);
	}

	/**
	 * gets the WalkToOtherHost status
	 * @return true if the Robot is allowed to travel to other
	 * hosts than the start host, false otherwise
	 */
	public boolean getWalkToOtherHosts() {
		return walkToOtherHosts;
	}

	/**
	 * sets the WalkToOtherHosts status
	 * @param walkToOtherHosts true if the Robot is allowed to travel to other
	 * hosts than the start host, false otherwise
	 */
	public void setWalkToOtherHosts(boolean walkToOtherHosts) {
		this.walkToOtherHosts = walkToOtherHosts;
	}

	/**
	 * gets the AllowWholeHost value
	 * @return true if the Robot is allowed to travel to the whole 
	 * host where it started from, false otherwise. If false, it is only
	 * allowed to travel to URLs below the start URL
	 */
	public boolean getAllowWholeHost() {
		return allowWholeHost;
	}

	/**
	 * sets the AllowWholeHost status
	 * @param allowWholeHost if true, the Robot is allowed to
	 * travel to the whole host where it started from. Otherwise it is only
	 * allowed to travel to URLs below the start URL.
	 */
	public void setAllowWholeHost(boolean allowWholeHost) {
		this.allowWholeHost = allowWholeHost;
	}

	/**
	 * Gets the AllowWholeDomain value.
	 * @return true if the Robot is allowed to travel to the whole 
	 * domain of the start host, false otherwise. 
	 * @see #setAllowWholeDomain(boolean)
	 */
	public boolean getAllowWholeDomain() {
		return allowWholeDomain;
	}

	/**
	 * Sets the AllowWholeDomain status
	 * @param allowWholeDomain if true, the Robot is allowed to travel
	 * to all hosts in the same domain as the starting host. E.g. if you
	 * start at www.apache.org, it is also allowed to travel to
	 * jakarta.apache.org, xml.apache.org ...
	 */
	public void setAllowWholeDomain(boolean allowWholeDomain) {
		this.allowWholeDomain = allowWholeDomain;
	}

	/**
	 * Gets the state of flexible host checking (enabled or disabled).
	 *
	 * To find out if a new URL is on the same host, the robot usually
	 * compares the host part of both. Some web servers have an inconsistent
	 * addressing scheme and use the hostnames www.domain.com and domain.com.
	 * With flexible host check enabled, the robot will consider both
	 * hosts as equal.
	 *
	 * @return true, if flexible host checking is enabled
	 */
	public boolean getFlexibleHostCheck() {
		return flexibleHostCheck;
	}

	/**
	 * Defines if the host test should be more flexible.
	 *
	 * To find out if a new URL is on the same host, the robot usually
	 * compares the host part of both. Some web servers have an inconsistent
	 * addressing scheme and use the hostnames www.domain.com and domain.com.
	 * With flexible host check enabled, the robot will consider both
	 * hosts as equal.
	 *
	 * @param flexibleHostCheck set this true, to enable flexible host checking
	 * (disabled by default)
	 */
	public void setFlexibleHostCheck(boolean flexibleHostCheck) {
		this.flexibleHostCheck = flexibleHostCheck;
	}

	/**
	 * Gets the AllowCaching value.
	 * @return true if the Robot is allowed to cache documents in the
	 * docManager
	 * @see #setAllowCaching(boolean)
	 */
	public boolean getAllowCaching() {
		return allowCaching;
	}

	/**
	 * Sets the AllowCaching status
	 *
	 * @param allowCaching if true, the Robot is allowed to use
	 * cached documents. That means it will first try to get the document
	 * from the docManager cache and will only retrieve it if it is
	 * not found in the cache. If the cache returns a document, the robot
	 * will NEVER retrieve it again. Therefore, expiration mechanisms have
	 * to be included in the HttpDocManager method retrieveFromCache.
	 * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
	 */
	public void setAllowCaching(boolean allowCaching) {
		this.allowCaching = allowCaching;
	}

	/**
	 * @return the document manager of this robot
	 * @see HttpDocManager
	 */
	public HttpDocManager getDocManager() {
		return docManager;
	}

	/**
	 * Sets the document manager for this robot <br />
	 * Without a document manager, the robot will travel through the web but
	 * won't do anything with the retrieved documents (simply forget
	 * them). 
	 * A document manager can store them, extract information or 
	 * whatever you like. 
	 * There can be only one document manager, but you are free to combine
	 * functionalities of available document managers in a new object (e.g.
	 * to store the document and extract meta informations).
	 * @param docManager the document manager to use
	 */
	public void setDocManager(HttpDocManager docManager) {
		this.docManager = docManager;
	}

	/**
	 * Sets the CookieManager used by the HttpTool
	 * By default a MemoryCookieManager will be used, but you can
	 * use this method to use your own CookieManager implementation.
	 *
	 * @param cm an object that implements the CookieManager interface
	 */
	public void setCookieManager(CookieManager cm) {
		httpTool.setCookieManager(cm);
	}

	/**
	 * Gets the CookieManager used by the HttpTool
	 *
	 * @return the CookieManager that will be used by the HttpTool
	 */
	public CookieManager getCookieManager() {
		return httpTool.getCookieManager();
	}

	/**
	 * Sets the DownloadRuleSet of the used HttpTool
	 * @param rules the download rule set to use
	 */
	public void setDownloadRuleSet(DownloadRuleSet rules) {
		httpTool.setDownloadRuleSet(rules);
	}

	/**
	 * Sets the URLCheck for this robot
	 * @param check the URLCheck that decides if a URL may be visited
	 */
	public void setURLCheck(URLCheck check) {
		this.urlCheck = check;
	}

	/** 
	 *  sets a proxy to use 
	 *  @param proxyDescr the Proxy definition in the format host:port
	 *  @throws HttpException if the proxy definition cannot be used
	 */
	public void setProxy(String proxyDescr) throws HttpException {
		httpTool.setProxy(proxyDescr);
	}

	/**
	 * @return the current proxy setting in the format host:port
	 */
	public String getProxy() {
		return httpTool.getProxy();
	}

	/**
	 * @return the Referer setting for the first HTTP request
	 */
	public String getStartReferer() {
		return startReferer;
	}

	/**
	 * sets the Referer setting for the first HTTP request
	 * @param startReferer an URL (e.g. http://www.matuschek.net)
	 */
	public void setStartReferer(String startReferer) {
		this.startReferer = startReferer;
	}

	/**
	 * should we ignore robots.txt Robot Exclusion protocol ?
	 * @param ignoreRobotsTxt if set to true, the robot will ignore
	 * the settings of the /robots.txt file on the webserver
	 * <b>Know what you are doing if you change this setting</b>
	 * @see #getIgnoreRobotsTxt()
	 */
	public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
		// keep the local field in sync: the old code only forwarded the
		// flag to robCheck, so getIgnoreRobotsTxt() returned a stale value
		this.ignoreRobotsTxt = ignoreRobotsTxt;
		robCheck.setIgnore(ignoreRobotsTxt);
	}

	/** 
	 * @return the sleep time setting in seconds
	 */
	public int getSleepTime() {
		return sleepTime;
	}

	/**
	 * set the sleep time<br />
	 * after every retrieved document the robot will wait this time
	 * before getting the next document. this allows it to limit the
	 * load on the server
	 * @param sleepTime wait time in seconds
	 */
	public void setSleepTime(int sleepTime) {
		this.sleepTime = sleepTime;
	}

	/**
	 * sets the From: HTTP header<br />
	 * this should be a valid email address. it is not needed for the robot,
	 * but you should use it, because the administrator of the web server
	 * can contact you if the robot is doing things that he doesn't want
	 * @param fromAddress an RFC 822 email address
	 */
	public void setFromAddress(String fromAddress) {
		httpTool.setFromAddress(fromAddress);
	}

	/**
	 * Sets the list of form handlers and updates the internal flag
	 * that tells the robot whether any form processing is needed.
	 *
	 * @param handlers Vector of form handlers; may be null or empty
	 * to disable form filling
	 * @see net.matuschek.html.FormHandler for more 
	 * information about form handlers
	 */
	public void setFormHandlers(Vector handlers) {
		formFiller.setFormHandlers(handlers);
		// keep the flag in sync in both directions: the old code only
		// ever set it to true, leaving it stale when the handlers were
		// later replaced by an empty/null list
		hasFormHandlers = (handlers != null && handlers.size() > 0);
	}

	/**
	 * @return the list of form handlers
	 * @see net.matuschek.html.FormHandler for more information 
	 * about form handlers
	 */
	public Vector getFormHandlers() {
		return formFiller.getFormHandlers();
	}

	/**
	 * Gets the name of the "User-Agent" header that the robot will use
	 * @return the user agent name, or null if no HttpTool is set
	 */
	public String getAgentName() {
		if (httpTool != null) {
			return httpTool.getAgentName();
		} else {
			return null;
		}
	}

	/**
	 * sets the Agent-Name for this robot (used in the User-Agent header)
	 * @param name a name for this robot 
	 * (e.g. "Mozilla 4.0 (compatible; Robot)")
	 */
	public void setAgentName(String name) {
		httpTool.setAgentName(name);
		// the robots.txt checker identifies itself by the agent name,
		// so it must be re-created whenever the name changes
		robCheck = new NoRobots(name, httpTool);
	}

	/**
	 * Gets the timeout for getting data in seconds of the used HttpTool
	 * @return the value of socketTimeout, or -1 if no HttpTool is set
	 * @see #setTimeout(int)
	 */
	public int getTimeout() {
		if (httpTool != null) {
			return httpTool.getTimeout();
		} else {
			return -1;
		}
	}

	/**
	 * Sets the timeout for getting data. If HttpTool can't read data from a
	 * remote web server after this number of seconds it will stop the download
	 * of the current file
	 * @param timeout Timeout in seconds
	 */
	public void setTimeout(int timeout) {
		httpTool.setTimeout(timeout);
	}

	/**
	 * Gets the ntlmAuthorization of the robot
	 * @return the ntlmAuthorization, or null if no HttpTool is set
	 */
	public NTLMAuthorization getNtlmAuthorization() {
		if (httpTool != null) {
			return httpTool.getNtlmAuthorization();
		} else {
			return null;
		}
	}

	/**
	 * sets a ntlmAuthorization for this robot
	 * @param ntlmAuthorization the NTLM credentials to use for requests
	 */
	public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
		httpTool.setNtlmAuthorization(ntlmAuthorization);
	}

	/**
	 * Gets the setting of the IgnoreRobotsTxt property
	 * @return true if robots.txt will be ignored, false otherwise
	 */
	public boolean getIgnoreRobotsTxt() {
		return ignoreRobotsTxt;
	}

	/**
	 * Gets a vector of URLs that can be visited more than once
	 * @return a vector containing URLs formatted as Strings
	 */
	public Vector getVisitMany() {
		return visitMany;
	}

	/** Sets the vector of URL strings that can be visited more than once */
	public void setVisitMany(Vector visitMany) {
		this.visitMany = visitMany;
	}

	/** Sets the callback that the HttpTool reports download progress to */
	public void setHttpToolCallback(HttpToolCallback callback) {
		httpTool.setCallback(callback);
	}

	/** @return the robot status callback, or null if none is set */
	public WebRobotCallback getWebRobotCallback() {
		return webRobotCallback;
	}

	/** Sets the callback that is informed about robot status and progress */
	public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
		this.webRobotCallback = webRobotCallback;
	}

	/**
	 * Sets the sleep status for this robot. If a WebRobot is set to sleep
	 * after starting run(), it will wait after retrieving the current document
	 * and wait for setSleep(false)
	 */
	public void setSleep(boolean sleep) {
		this.sleep = sleep;
	}

	/**
	 * Is the robot sleeping ?
	 */
	public boolean isSleeping() {
		return this.sleep;
	}

	/** 
	 * Set the list of allowed URLs
	 * @param allowed a Vector containing Strings. URLs will be checked
	 * if they begin with a string in this vector
	 */
	public void setAllowedURLs(Vector allowed) {
		this.allowedURLs = allowed;
	}

	/**
	 * Gets the list of allowed URLs
	 * @return a Vector containing Strings
	 * @see #setAllowedURLs(Vector)
	 */
	public Vector getAllowedURLs() {
		return this.allowedURLs;
	}
	
	/**
	 * Enable/disable cookies
	 * @param enable if true, HTTP cookies will be enabled, if false
	 * the robot will not use cookies
	 */
	public void setEnableCookies(boolean enable) {
		httpTool.setEnableCookies(enable);
	}

	/**
	 * Get the status of the cookie engine
	 * @return true, if HTTP cookies are enabled, false otherwise
	 */
	public boolean getEnableCookies() {
		return httpTool.getEnableCookies();
	}

	/** 
	 * Set the maximum age of documents to retrieve to this number
	 * of seconds
	 * @param maxAge integer value of the maximum document age 
	 * (in seconds), negative value means no limit.
	 */
	public void setMaxDocumentAge(long maxAge) {
		this.maxDocumentAge = maxAge;
	}
	


	/**
	 * Gets the maximum age of documents to retrieve
	 * @return maximum document age (in seconds), negative value means 
	 * no limit.
	 */
	public long getMaxDocumentAge() {
		return this.maxDocumentAge;
	}

	/**
	 * Sets a FilterChain. If the WebRobot uses a FilterChain it will
	 * process any retrieved document by this FilterChain before
	 * storing it
	 *
	 * @param filters a FilterChain to use for filtering HttpDocs
	 */
	public void setFilters(FilterChain filters) {
		this.filters = filters;
	}

	/**
	 * Delete all cookies
	 */
	public void clearCookies() {
		httpTool.clearCookies();
	}

	/**
	 * thread run() method, simply calls work()
	 * @see #work()
	 */
	public void run() {
		work();
	}

	/**
	 * do your job: travel through the web using the configured 
	 * parameters starting at startURL and retrieve documents
	 */
	public void work() {
		RobotTask task = createRobotTask(startURL, maxDepth, startReferer);
		todo.add(task);
		walkTree();
		// ok, we did it, clean up dynamic data (the visited vector)
		cleanUp();
		log.info("Documents retrieved by: Web=" + countWeb + " Cache=" + countCache + " Refresh=" + countRefresh+ " NoRefresh=" + countNoRefresh);

	}

	/**
	 * stop the current robot run 
	 * note that this will not abort the current download but stop after
	 * the current download has finished
	 */
	public void stopRobot() {
		stopIt = true;
	}

	/**
	 * Holds information about memory status: the number of
	 * OutOfMemoryErrors handled so far.
	 * @see #handleMemoryError(OutOfMemoryError)
	 */
	private int memoryLevel = 0;
	
	/** Can new tasks be added? (may depend on memoryLevel) */
	protected boolean activatedNewTasks = true;
	
	/** Are visited URLs collected? (may depend on memoryLevel) */
	protected boolean activatedUrlHistory = true;
	
	/** Are visited contents collected? (may depend on memoryLevel) */
	protected boolean activatedContentHistory = true;
	
	/** memory buffer of 200 KB to be freed in case of urgent memory needs */
	private byte memoryBuffer[] = new byte[200 * 1024];

	/**
	 * Main processing loop: takes tasks from the to-do list until it is
	 * empty (or a stop was requested), retrieves each URL and handles
	 * suspend requests and UI callbacks on the way.
	 */
	
	public void walkTree() {
		while ((todo.size() > 0) && (!stopIt)) {
			RobotTask task;
			// take the next task and mark it as visited atomically, so
			// that concurrent spider threads (see spawnThread) do not
			// process the same URL twice
			synchronized(visited) {
				task = todo.removeFirst();
				if (visited.contains(task) && (!visitMany.contains(task.getUrl().toString()))) {
					log.debug("already visited: " + task.getUrl());
					continue;
				}
				if (activatedUrlHistory) {
					visited.add(task);
				}
			}
			
			// retry the same task until it succeeds or the robot is
			// stopped; OutOfMemoryErrors are handled by degrading
			// functionality step by step (see handleMemoryError)
			boolean repeat = true;
			while (repeat && (!stopIt)) {
				try {
					retrieveURL(task);
					repeat = false;
				} catch (OutOfMemoryError memoryError) {
					handleMemoryError(memoryError); 
				}
			}

			// poll once a second while suspended via setSleep(true)
			while (sleep) {
				// callback: report that the robot is sleeping
				if (webRobotCallback != null) {
					webRobotCallback.webRobotSleeping(true);
				}

				try {
					Thread.sleep(1000);
				} catch (InterruptedException e) {
				};
			}

			// callback: robot is (no longer) sleeping
			if (webRobotCallback != null) {
				webRobotCallback.webRobotSleeping(false);
			}

			// callback: report the current queue size
			if (webRobotCallback != null) {
				webRobotCallback.webRobotUpdateQueueStatus(todo.size());
			}
			spawnThread();
		}

		// callback: tell the UI we are done and finish the doc manager
		if (webRobotCallback != null) {
			finishThreads();
		}
	}

	/**
	 * Implements OutOfMemory handling strategies.
	 * Action depends on memoryLevel: level 1 drops the URL/content
	 * history, level 2 additionally stops accepting new tasks and
	 * releases the emergency memory buffer, level 3 and above rethrows
	 * the error to stop the robot.
	 * @param memoryError the error that was caught during retrieval
	 * @throws OutOfMemoryError if no further degradation is possible
	 */
	protected void handleMemoryError(OutOfMemoryError memoryError)
		throws OutOfMemoryError {
		memoryLevel++;
		log.error("OutOfMemoryError level=" + memoryLevel + "! (visited=" + visited.size() + ", todo=" + todo.size() + ")");
		switch (memoryLevel) {
			case 1:
				// don't remember visited URLs and contents any more
				// and try it again
				visited.clear(); activatedUrlHistory = false;
				content2UrlMap.clear(); activatedContentHistory = false;
				System.gc();
				break;
			case 2:
				// stop adding new Tasks, just process todo-list.
				// free memory buffer 
				// and try it again 
				activatedNewTasks = false;
				memoryBuffer = null;
				System.gc();
				break;
			case 3:
				// there is nothing we can do any more.
				// throw exception to stop robot
				throw memoryError;
			default :
				// Should never be reached.
				if (memoryBuffer != null) {
					// avoid removal of memoryBuffer by compiler
					System.err.println(memoryBuffer[0]);
				}
				throw memoryError;
		}
	}

	/**
	 * Called when the tree walk is finished: notifies the callback
	 * (if any) that the robot is done and lets the document manager
	 * finish/flush its resources.
	 *
	 * walkTree() only invokes this when a callback is set, but the
	 * method is protected and may be called by subclasses, so the
	 * callback is null-checked here as well.
	 */
	protected void finishThreads() {
		if (webRobotCallback != null) {
			webRobotCallback.webRobotDone();
		}
		if (docManager != null) {
			docManager.finish();
		}
	}
	
	/**
	 * Start subThreads for spidering. This base implementation does
	 * nothing; subclasses may override it.
	 * WARNING: Should only be implemented and used for local
	 * spidering purposes!
	 */
	protected synchronized void spawnThread() {
	}
	
	/** counter for calls of retrieveURL */
	protected int iteration = 0;
	
	/**
	 * retrieve the next URL, save it, extract all included links and
	 * add those links to the tasks list
	 * @param task task to retrieve, function does nothing if this is null
	 */
	public void retrieveURL(RobotTask task) {
		if (task == null) {
			log.debug("Empty task found, ignoring");
			return;
		}
		
		long now = System.currentTimeMillis();

		updateProgressInfo();

		URL u = task.getUrl();
		String urlString = u.toString();
		String referer = task.getReferer();
		int depth = task.getMaxDepth();

		if (depth < 0) {
			log.info("Max search depth reached");
			return;
		}

		// we may need this additional check even if we
		// tested it during adding to the tasks list 
		if (!isAllowed(u)) {
			log.info("Url '" + u + "' filtered out.");
			return;
		}

		if (u.getFile().equals("")) {
			try {
				urlString = urlString + "/";
				u = new URL(urlString);
				// fix for double retrieved files
				task.setUrl(u);
			} catch (MalformedURLException e) {
				log.error("URL not well formed: " + e.toString());
				// use exception handler to handle exception
				exceptionHandler.handleException(this, u, e);
				return;
			}
		}

		log.info("retrieving " + urlString);
		httpTool.setReferer(referer);

		HttpDoc doc = null;
		Vector links = null;
		boolean cached = false;

		// look in the cache first, but only for static pages
		boolean reScan = true;
		if ((docManager != null && allowCaching)
			&& (task.getMethod() == HttpConstants.GET)
			&& (task.getParamString() == null)) {
			doc = docManager.retrieveFromCache(u);
/*			if (doc != null) {
				try {
					links = ((UrlCollector) docManager).retrieveLinks(doc);
				} catch (IOException e) {
					log.info("Could not get links for " + u + ": " + e.getMessage());
					links = null;
				} 
			}*/
			
			if (doc != null) {
				countCache++;
				long lastRetrieved = doc.getDateAsMilliSeconds();
				double ageInSeconds = (now - lastRetrieved) / 1000;
				if (ageInSeconds < 0) {
					log.warn("DocumentAge < 0!");
				}
				reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
				if (reScan) {
					long lastModified = doc.getLastModifiedAsMilliSeconds();
					Date lastModifiedDate = new Date(lastModified);
					httpTool.setIfModifiedSince(lastModifiedDate);
				}
			} else {
				httpTool.setIfModifiedSince(null);
			}
		}

		// if not found in cache, retrieve from the web page
		if (reScan) {
			HttpDoc newDoc;
			boolean error = false;
			try {
				if (u.getProtocol().equalsIgnoreCase("file")) {
					// retrieve from file
					newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
				} else {
					// retrieve from Web
					newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
					if (newDoc != null) {
						newDoc.setDate(now);
					}
					sleepNow();
				}
				
				if (newDoc!= null && !newDoc.isNotModified()) {
					if (!(newDoc.isOk() || newDoc.isRedirect())) {
						error = true;
					}
				} else {
					// (newDoc == null || newDoc.isNotModified()) && doc != null 
					// -> Not modified
					// -> refresh time stamp
					if (doc != null) {
						doc.setDate(now);
						doc.setCached(false);
						newDoc = null;
					}
				}
			} catch (HttpException hex) {
				error = true; newDoc = null;
			}
			if (error) {
				int retry = task.retry();
				if (retry <= maxRetries) {
					synchronized(visited) {
						todo.add(task);
						visited.remove(task);
					}
					log.info("Adding " + u + " for retry no. " + retry);
					return;
				} else {
					doc = docManager.retrieveFromCache(u);
					if (doc == null) {
						log.warn("Unsuccessfull retries for " + u);
						return;
					} else {
						long docDate = doc.getDateAsMilliSeconds();
						long age = (now - docDate);
						age /= 1000;
						if (expirationAge < 0 || age < expirationAge) {
							newDoc = doc;
							cached = true;
							log.info("Cached document not expired: " + u);
						} else {
							log.warn("Cached document expired: " + u);
							docManager.removeDocument(u);
							return;
						}
					}
				}
			}
			
			if (newDoc != null) {
				countWeb++;
				doc = newDoc;
				links = null; // force recalculation of links
				countRefresh++;
			} else {
				cached = true;
				countNoRefresh++;
			}
		} else {
			cached = true;
			log.debug("Page " + u + " retrieved from cache");
		}

		// Add it to the visited vector
		// needs to be synchronized with todo-list
//		visited.add(task); 
		
		// got a NULL document, that doc was not retrieved
		// usually, it was not downloaded because a rule didn't allow
		// to download it
		if (doc == null) {
			log.info("not downloaded " + u);
			return;
		}

		// Duplicate check
		String duplicate=null;
		if (duplicateCheck) {
			duplicate = getContentVisitedURL(doc);
			if (duplicate != null) {
				log.info("URLs with same content found: " + urlString + " = " + duplicate);
			} else {	
				try {
					duplicate = docManager.findDuplicate(doc);
					if (duplicate != null) {
						log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
					}
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			
			if (duplicate != null) {
				String pureDuplicate = removeParameters(duplicate);
				String pureUrl = removeParameters(urlString);
				if (!pureUrl.equals(pureDuplicate) && !cached) {
					// different url not yet stored -> store it
					try {
						// retrieve links from original
						HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
						if (linksDoc != null) {		
							doc.setLinks(linksDoc.getLinks());
						}
						docManager.storeDocument(doc);
					} catch (Exception e) {
						e.printStackTrace();
					}
				}
				RobotTask newTask;
				try {
					newTask = createRobotTask(new URL(duplicate), depth, referer);
					// check already here for visited tasks to save memory
					if (!visited.contains(newTask)) {
						addTask(newTask);
					}
				} catch (MalformedURLException e) {
					e.printStackTrace(); // Can�t happen
				}
				return;
			} 
		}

		// was it an UnAuthorized document ?
		if (doc.isUnauthorized()) {
			log.info("got HTTP Unauthorized for URL " + u);
		}

		if (doc.isOk() || cached) {
			// callback
			if (webRobotCallback != null) {
				int contentLength=0;
				if (doc.getContent() != null) { contentLength=doc.getContent().length; }
				webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
			}

			// extract links
			try {
				if (doc.isHTML() && (depth > 0)) {
					// solving encoding problem
					// HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
					HtmlDocument htmlDoc = null;
					HttpHeader contentTypeHeader = doc.getHeader("Content-type");
					if (contentTypeHeader != null) {
						String contentType = contentTypeHeader.getValue();
						int index = contentType.toLowerCase().indexOf("charset=");
						if (index > 0) {
							htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
						} else {
							htmlDoc = new HtmlDocument(u, doc.getContent());
						}
					} else {
						htmlDoc = new HtmlDocument(u, doc.getContent());
					}
	
					// add links
					
					// this depth-check is critical!
					// otherwise far too many RobotTasks will be created
					// this will cause a premature OutOfMemoryException!
					if (depth > 0) {
						if (duplicate != null) {
							HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
							doc.setLinks(linksDoc.getLinks());
						} else if (cached) {
						} 
						if (links == null) {
							links = htmlDoc.getLinks();
							doc.setLinks(links);
						}
						if (duplicate == null) {
							HashSet checkedLinks = new HashSet();
							for (int i = 0; i < links.size(); i++) {
								URL link = (URL) links.elementAt(i);
								log.info("Link: "+link);
								// check already here for duplicate links to avoid expensive
								// creation of RobotTasks
								if (!checkedLinks.contains(link)) {
									checkedLinks.add(link);
									String myReferer = u.toString();
									if (u.getUserInfo() != null) {
										// remove userinfo from referer
										int endindex = myReferer.indexOf("@")+1;
										myReferer = "http://"+ myReferer.substring(endindex);
									}
									
									RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
									// check already here for visited tasks to save memory
									if (!visited.contains(newTask)) {
										// bad workaround to retrieve images first
										if (newTask.urlString.endsWith(".jpg")) {
											addTaskAtStart(newTask);
										} else {
											addTask(newTask);
										}
									}
								}
							}
						}
					}
					
					if (hasFormHandlers) {
						// add forms
						Vector forms = htmlDoc.getElements("form");
						for (int i = 0; i < forms.size(); i++) {
							ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
							if (eurl != null) {
								RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
								newTask.setParamString(eurl.getParams());
								newTask.setMethod(eurl.getRequestMethod());
								addTask(newTask);
							}
						}
					}
	
				}
			// catch any occuring error to keep on processing
			} catch (OutOfMemoryError e) {
				throw e;
			} catch (Throwable e){
				log.error("Unexpected error while extraction links from url '" + u + "':"+e);
				e.printStackTrace();
				// continue processing
			}

			// filter and store the document
			if ((docManager != null)) {
				try {
					if (filters != null) {
						doc = filters.process(doc);
					} else {
						log.debug("No filters defined");
					}
					
					if (isProcessingAllowed(doc)) {
						docManager.processDocument(doc);
					} else	{
						String md5 = doc.getHeaderValue(HttpHeader.CONTENT_MD5);
						doc.setContent("Not for indexing".getBytes());
						doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
					} 
					
					try {
						docManager.storeDocument(doc);
					} catch (Exception e) {
						log.warn("could not store (not for indexing) " + urlString + ": " + e.getMessage());
					}
					if (activatedContentHistory && duplicate==null) {
						setContentVisitedURL(doc, urlString);
					}
				} catch (DocManagerException e1) {
					log.error("could not process document: " + e1.getMessage());
					exceptionHandler.handleException(this, u, e1);
				} catch (FilterException e2) {
					log.error(e2.getMessage());
				}
			}

		} else {
			// it was NOT a 200 return code !

			if (doc.isRedirect()) {
				String ref = doc.getLocation();
				log.info("Got redirect to " + ref);

				try {
					URL u2 = new URL(u, ref);
					// is it on another host ?

					// On a redirect, browsers use the old Referer instead of the
					// URL that got this redirect
					// Therefore we do not use u.toString as Referer but the old Referer
					RobotTask newTask = createRobotTask(u2, depth - 1, referer);

					// it will be inserted at the beginning of the vector !
					addTaskAtStart(newTask);
				} catch (MalformedURLException e) {
					// ignore this URL
				}
				// handle other values
			} else if (doc.isNotFound()) {
				// the document was not found
				exceptionHandler.handleException(this, u, new HttpException("Document not found"));
			} else if (doc.isUnauthorized()) {
				// the document was not found
				exceptionHandler.handleException(
					this,
					u,
					new HttpException("No authorization for the document."));
			} else {
				// an other error occured.
				exceptionHandler.handleException(this, u, new HttpException("Unknown document error (Http return code "+doc.getHttpCode()+")."));
			}
		}
	}

	/**
	 * Inform about spidering progress.
	 * Intentionally empty: subclasses may override this hook to report
	 * progress and may use iteration, startTime, countCache, countWeb,
	 * countRefresh and countNoRefresh for that purpose.
	 */
	public void updateProgressInfo() {
	}

	/**
	 * Sleeps for sleepTime seconds; does nothing if sleepTime is not positive.
	 * If a WebRobotCallback is registered it is notified before and after
	 * the pause via webRobotSleeping(true/false).
	 */
	public void sleepNow() {
		if (sleepTime > 0) {
			synchronized(this) {
				if (webRobotCallback != null) {
					webRobotCallback.webRobotSleeping(true);
				}
				
				try {
					Thread.sleep(sleepTime * 1000);
				} catch (InterruptedException e) {
					// restore the interrupt status so callers higher up
					// can still detect that the thread was interrupted
					Thread.currentThread().interrupt();
				}
			
				if (webRobotCallback != null) {
					webRobotCallback.webRobotSleeping(false);
				}
			}
		}
	}

	/**
	 * Loads a document from the local file system (for "file:" URLs).
	 *
	 * @param url the file URL to read
	 * @param ifModifiedSince content is only returned if the file is newer
	 *        than this date; may be null to always return the content
	 * @return an HttpDoc carrying the content, a guessed mime type and a
	 *         pseudo HTTP status code (OK, NOTFOUND or NOTMODIFIED)
	 * @throws HttpException if any error occurs while accessing the file
	 */
	private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince) throws HttpException {
		HttpDoc result = new HttpDoc();

		try {
			String host = url.getHost();
			String path = url.getFile();
			if ((host == null) || (host.equals(""))) {
				// local file: strip the leading / or \ so the path is
				// resolved relative to the current working directory
				if (path.startsWith("\\") || path.startsWith("/")) {
					path = path.substring(1);
				}
			} else {
				// host given: build a UNC-style path //host/file
				path = "//" + host + path;
			}

			// attach a content-type header if the mime type can be guessed
			String mimeType = getMimeTypeForFilename(path);
			if (mimeType != null) {
				result.addHeader(new HttpHeader("content-type", mimeType));
			}

			File file = new File(path);
			if (!file.exists()) {
				result.setHttpCode("httpcode " + HttpConstants.HTTP_NOTFOUND);
				return result;
			}

			// emulate the HTTP "If-Modified-Since" handshake on file dates
			long lastModified = file.lastModified();
			long threshold = (ifModifiedSince == null) ? 0 : ifModifiedSince.getTime();
			if (lastModified > threshold) {
				result.setContent(readFileToByteArray(file));
				result.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
			} else {
				result.setHttpCode("httpcode " + HttpConstants.HTTP_NOTMODIFIED);
			}
			result.setLastModified(lastModified);
			result.setDate(System.currentTimeMillis());
			result.setURL(url);

			return result;
		} catch (Exception e) {
			throw new HttpException(e.getMessage());
		}
	}

	/**
	 * Get the Mime type for the given filename, derived from its extension.
	 * The extension check is case-insensitive, so e.g. "INDEX.HTML" is
	 * recognized as HTML too.
	 *
	 * @param filename name of the file
	 * @return "text/html" for .html/.htm files, null for any other extension
	 */
	protected String getMimeTypeForFilename(String filename) {
		// Locale.ROOT avoids surprises with locale-specific case mappings
		String lower = filename.toLowerCase(java.util.Locale.ROOT);
		if (lower.endsWith(".html") || lower.endsWith(".htm")) {
			return "text/html";
		}
		return null;
	}
	
	/** 
	 * Clean up temporary data.
	 * Empties the visited set and the todo list so the robot can be
	 * reused for a fresh run.
	 */
	protected void cleanUp() {
		// NOTE(review): the stop flag is re-armed here, i.e. cleanUp also
		// resets a previously requested stop.
		stopIt = false;
		visited.clear();
		todo.clear();
	}

	/** 
	 * Appends a new task to the todo list, provided the task passes
	 * taskAddAllowed() and adding new tasks is currently activated.
	 *
	 * @param task the task to queue
	 */
	protected void addTask(RobotTask task) {
		if (!taskAddAllowed(task)) {
			return;
		}
		if (!activatedNewTasks) {
			return;
		}
		todo.add(task);
	}

	/** 
	 * Inserts a new task at the beginning of the todo list, provided the
	 * task passes taskAddAllowed() and adding new tasks is activated.
	 *
	 * @param task the task to queue with priority
	 * @see #addTask(RobotTask)
	 */
	protected void addTaskAtStart(RobotTask task) {
		if (!taskAddAllowed(task)) {
			return;
		}
		if (!activatedNewTasks) {
			return;
		}
		todo.addAtStart(task);
	}

	/**
	 * Checks if a task should be added to the task list.
	 *
	 * @param task the candidate task
	 * @return true if this task can be added to the task list,
	 * false otherwise
	 */
	protected boolean taskAddAllowed(RobotTask task) {
		if (task == null) {
			log.info("Null task not allowed");
			return false;
		}

		// rejected by the configured URL checks?
		if (!isAllowed(task.getUrl())) {
			return false;
		}

		// already queued?
		return !todo.contains(task);
	}

	/**
	 * Is it allowed to travel to this new URL ?
	 *
	 * @param u the URL to test
	 * @return true if traveling to this URL is allowed, false otherwise
	 */
	protected boolean isAllowed(URL u) {
		// basic host/path checks first
		if (!basicURLCheck(u)) {
			return false;
		}

		// if an URLCheck is configured, it gets a veto next
		if ((urlCheck != null) && (!urlCheck.checkURL(u))) {
			log.debug("not allowed by URLCheck:" + u);
			return false;
		}

		// finally obey robots.txt
		if (!robCheck.ok(u)) {
			log.debug("not allowed by robots.txt:" + u);
			return false;
		}

		return true;
	}
	
	/**
	 * Is it allowed to process this document ?
	 *
	 * @param doc the document to test
	 * @return true if processing of this document is allowed
	 */
	protected boolean isProcessingAllowed(HttpDoc doc) {
		URL url = doc.getURL();

		if ((urlCheck != null) && (!urlCheck.checkURLForProcessing(url))) {
			log.debug("processing not allowed by URLCheck:" + url);
			return false;
		}

		DownloadRuleSet rules = httpTool.getDownloadRuleSet();
		if (rules != null && !rules.processAllowed(doc.getHttpHeaders())) {
			log.debug("processing not allowed by DownloadRuleSet:" + url);
			return false;
		}

		return true;
	}

	/**
	 * Basic URL allow check.
	 * It is allowed to walk to a new URL if <ul>
	 *  <li>WalkToOtherHost is true. In this case there will be no additional
	 *      tests.</li>
	 *  <li>The new URL is located below the start URL, e.g. if the start URL
	 *      is http://localhost/test, the URL http://localhost/test/index.html
	 *      is allowed, but http://localhost/ is not allowed.</li>
	 *  <li>AllowWholeHost is true and the new URL is located on the same host
	 *      as the start URL.</li>
	 *  <li>FlexibleHostCheck is true and the host part of the current URL
	 *      is equal to the host part of the start URL modulo the prefix "www."
	 *      </li>
	 *  <li>AllowWholeDomain is true and the new URL is in the same domain
	 *      as the start URL.</li>
	 *  <li>The URL starts with a string in the "AllowedURLs" list.</li>
	 * </ul>
	 */
	protected boolean basicURLCheck(URL currURL) {
		String target = currURL.getHost() + currURL.getPath();
		String targetHost = currURL.getHost().toLowerCase();
		String originHost = startURL.getHost().toLowerCase();

		// anything goes when crossing hosts is permitted
		if (walkToOtherHosts) {
			return true;
		}

		// below the start directory?
		if (target.startsWith(startDir)) {
			return true;
		}

		// same host as the start URL?
		if (allowWholeHost
			&& currURL.getHost().equalsIgnoreCase(startURL.getHost())) {
			return true;
		}

		// same host modulo a leading "www."?
		if (flexibleHostCheck
			&& cutWWW(targetHost).equalsIgnoreCase(cutWWW(originHost))) {
			return true;
		}

		// same domain as the start URL?
		if (allowWholeDomain && targetHost.endsWith(getDomain(originHost))) {
			return true;
		}

		// explicitly allowed prefix?
		for (int i = 0; i < allowedURLs.size(); i++) {
			String allowed = (String) allowedURLs.elementAt(i);
			if (target.startsWith(allowed)) {
				return true;
			}
		}

		log.debug("URL " + target + " not allowed");
		return false;
	}

	/**
	 * Strips a leading "www." (case-insensitive) from a given hostname.
	 * 
	 * @param hostname some hostname
	 * @return the hostname without the leading "www." if present,
	 *  otherwise the hostname unchanged
	 */
	private String cutWWW(String hostname) {
		return hostname.toLowerCase().startsWith("www.")
			? hostname.substring(4)
			: hostname;
	}

	/** 
	 * Gets the domain name of a given host by deleting everything
	 * up to and including the first ".".
	 *
	 * @param hostname some hostname
	 * @return the domain part of this hostname; the hostname itself
	 *  if it contains no dot (should not happen for real hosts)
	 */
	private String getDomain(String hostname) {
		int dot = hostname.indexOf(".");
		return (dot < 0) ? hostname : hostname.substring(dot + 1);
	}

	/**
	 * Returns the handler that is informed about retrieval problems.
	 * @return RobotExceptionHandler the exception handler of the robot
	 */
	public RobotExceptionHandler getExceptionHandler() {
		return exceptionHandler;
	}

	/**
	 * Sets the exception handler of the robot.
	 * A null argument is ignored and the current handler is kept.
	 * @param newExceptionHandler the new exception handler
	 */
	public void setExceptionHandler(RobotExceptionHandler newExceptionHandler) {
		if (newExceptionHandler != null) {
			exceptionHandler = newExceptionHandler;
		}
	}

	/**
	 * Sets the start URL from a string.
	 * A malformed URL is reported on stderr and otherwise ignored.
	 * @param startURL the start URL as String
	 */
	public void setStart(String startURL) {
		try {
			setStartURL(new URL(startURL));
		} catch (MalformedURLException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Gets the start URL as a string.
	 *
	 * @return the external form of the start URL, or null if no
	 *  start URL has been set
	 */
	public String getStart() {
		URL url = getStartURL();
		return (url == null) ? null : url.toExternalForm();
	}

	/**
	 * This method finishes HttpTool, NoRobots, HttpDocManager.
	 * Components that were never configured (null) are skipped, so the
	 * method is safe to call in any state.
	 */
	public void finish() {
		if (httpTool != null) {
			httpTool.finish();
		}
		if (robCheck != null) {
			robCheck.finish();
		}
		if (docManager != null) {
			docManager.finish();
		}
	}

	/**
	 * Development helper: prints one "robot.field = field;" assignment line,
	 * padded to column 50 and annotated with the field type, for every
	 * non-static, non-final field of WebRobot (useful when writing clone
	 * or copy code). Command line arguments are ignored.
	 */
	public static void main(String[] args) {
		if (args.length > 0) System.err.println("Arguments will be ignored!");
		Field[] fields = WebRobot.class.getDeclaredFields();
		StringBuilder line = new StringBuilder(60);
		for (int i = 0; i < fields.length; i++) {
			int mod = fields[i].getModifiers();
			if (Modifier.isFinal(mod) || Modifier.isStatic(mod)) {
				continue;
			}
			line.setLength(0);
			line.append("		robot." + fields[i].getName() + " = " + fields[i].getName() + ";");
			// pad with spaces so the trailing type comment lines up
			while (line.length() < 50) {
				line.append(" ");
			}
			System.out.println(line.toString() + "// (" + fields[i].getType().getName() + ")");
		}
	}

	/** default expected count of documents */
	private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;
	
	/** expected count of documents */
	protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;
	 
	/** remember visited content here (maps content MD5 -> urlString) */ 
	protected HashMap content2UrlMap;

	/** counter for pages that were found in cache */
	long countCache = 0;
	
	/** counter for pages retrieved by web */
	long countWeb = 0;
	
	/** counter for pages that didn't need a refresh */
	long countNoRefresh = 0;
	
	/** counter for refreshed pages (=cache+web) */
	long countRefresh = 0;
	
	/**
	 * Checks if content with the same MD5 was visited before and
	 * retrieves the URL it was first seen under.
	 *
	 * @param doc the document whose content MD5 is used as lookup key
	 * @return the previously registered URL or null if not found
	 */
	public String getContentVisitedURL(HttpDoc doc) {
		final Object key = doc.getContentMD5();
		synchronized (content2UrlMap) {
			return (String) content2UrlMap.get(key);
		}
	}
	
	/**
	 * Makes an URL retrievable by its content MD5 by entering it in
	 * content2UrlMap.
	 *
	 * @param doc the document whose content MD5 is used as key
	 * @param url the URL to remember for that content
	 */
	public void setContentVisitedURL(HttpDoc doc, String url) {
		final Object key = doc.getContentMD5();
		synchronized (content2UrlMap) {
			content2UrlMap.put(key, url);
		}
	}
	
	/**
	 * Factory for RobotTask objects; strips the configured waste
	 * parameters from the URL before the task is created.
	 */
	private final RobotTask createRobotTask(URL url, int maxDepth, String startReferer) {
		return new RobotTask(removeWasteParameters(url), maxDepth, startReferer);
	}

	/** only true if form-handlers are defined */
	boolean hasFormHandlers = false;
	
	/** list of waste parameter names (query parameters with these names will be removed from URLs) **/
	protected Vector wasteParameters = new Vector();
	
	/** 
	 * Set the list of wasteParameters (will be removed from URLs).
	 * @param wasteParameters Vector of Strings; a URL query parameter is
	 * removed if its name equals one of these entries
	 */
	public void setWasteParameters(Vector wasteParameters) {
		this.wasteParameters = wasteParameters;
	}

	/**
	 * Gets the list of wasteParameters (will be removed from URLs).
	 * @return a Vector containing Strings (waste parameter names)
	 */
	public Vector getWasteParameters() {
		return this.wasteParameters;
	}

	/** 
	 * Removes wasteParameters from URL (e.g. session IDs).
	 *
	 * @param url the URL to clean
	 * @return the URL without the configured waste parameters; the original
	 *         URL is returned unchanged if nothing was removed or if the
	 *         cleaned string is not a valid URL
	 */
	public URL removeWasteParameters(URL url) {
		String urlString = url.toExternalForm();
		String newUrlString = removeParametersFromString(urlString, wasteParameters);
		// Use equals() instead of the previous fragile reference
		// comparison (urlString != newUrlString) to decide whether a
		// new URL object has to be built.
		if (!urlString.equals(newUrlString)) {
			try {
				url = new URL(newUrlString);
			} catch (MalformedURLException ex) {
				ex.printStackTrace();
			}
		}
		return url;
	}
	
	/**
	 * Removes the passed parameters from a URL string.
	 * A query parameter is dropped when its name equals one of the
	 * entries in wasteParameters. A fragment ("#...") is preserved.
	 *
	 * @param urlString the URL to filter
	 * @param wasteParameters Vector of parameter names to remove
	 * @return the filtered URL string; the original String instance is
	 *         returned when nothing had to be removed
	 */
	public static String removeParametersFromString(String urlString, Vector wasteParameters) {
		if (wasteParameters == null || wasteParameters.isEmpty()) {
			return urlString;
		}
		int queryStart = urlString.indexOf("?");
		if (queryStart <= 0 || queryStart >= urlString.length()) {
			return urlString;
		}

		// split into query part and optional trailing fragment
		int fragmentStart = urlString.indexOf("#", queryStart);
		String query;
		String fragment;
		if (fragmentStart < 0) {
			query = urlString.substring(queryStart + 1);
			fragment = null;
		} else {
			query = urlString.substring(queryStart + 1, fragmentStart);
			fragment = urlString.substring(fragmentStart);
		}

		StringBuilder kept = new StringBuilder(urlString.substring(0, queryStart));
		String separator = "?";
		boolean removedAny = false;
		StringTokenizer tokens = new StringTokenizer(query, "&");
		while (tokens.hasMoreTokens()) {
			String pair = tokens.nextToken();
			boolean discard = false;
			for (int w = 0; w < wasteParameters.size(); w++) {
				String wasteParameter = (String) wasteParameters.elementAt(w);
				if (pair.startsWith(wasteParameter + "=")) {
					discard = true;
					removedAny = true;
					break;
				}
			}
			if (!discard) {
				kept.append(separator);
				kept.append(pair);
				separator = "&";
			}
		}
		if (fragment != null) {
			kept.append(fragment);
		}
		// preserve the original instance when nothing changed
		return removedAny ? kept.toString() : urlString;
	}
	
	/** time of WebRobot start in milliseconds */
	protected long startTime = System.currentTimeMillis();
	
	/** number of allowed retries for document retrieval */
	protected int maxRetries = 0;
	
	/**
	 * Set allowed retries for document retrieval.
	 * @param maxRetries the new maximum number of retries
	 */
	public void setMaxRetries(int maxRetries) { this.maxRetries = maxRetries; }
	
	/**
	 * Get allowed retries for document retrieval.
	 * @return maxRetries the maximum number of retries
	 */
	public int getMaxRetries() { return maxRetries; }
	
	/** 
	 * expiration age of documents in cache (in seconds).
	 * Documents older than expirationAge will be removed,
	 * a negative value means no limit. 
	 */
	protected long expirationAge = -1;
	
	/**
	 * Set expiration age of documents in cache.
	 * Documents older than expirationAge will be removed,
	 * a negative value means no limit. 
	 * @param age the new expiration age
	 */
	public void setExpirationAge(long age) { expirationAge = age; }
	
	/**
	 * Get expiration age of documents in cache.
	 * @return long the expiration age (negative = no limit)
	 */
	public long getExpirationAge() { return expirationAge; }
	
	/**
	 * Removes the query part ("?...") from a URL string.
	 *
	 * @param url the URL string
	 * @return the URL without parameters; unchanged if it has none
	 */
	private final static String removeParameters(String url) {
		int queryStart = url.indexOf("?");
		if (queryStart < 0) {
			return url;
		}
		return url.substring(0, queryStart);
	}
	
	/**
	 * Reads the complete content of a file into a byte array.
	 *
	 * @param file the file to read
	 * @return byte[] containing the whole file content
	 * @throws IOException if the file cannot be opened or ends before
	 *         the expected number of bytes could be read
	 */
	protected byte[] readFileToByteArray(File file) throws IOException
	{
		// NOTE: files larger than Integer.MAX_VALUE bytes cannot be held
		// in a single byte array; the cast below would truncate.
		byte[] buffer = new byte[(int) file.length()];
		FileInputStream in = new FileInputStream(file);
		try
		{
			// A single read() call may return fewer bytes than requested,
			// so loop until the buffer is completely filled.
			int offset = 0;
			while (offset < buffer.length)
			{
				int count = in.read(buffer, offset, buffer.length - offset);
				if (count < 0)
				{
					throw new IOException("Unexpected end of file: " + file);
				}
				offset += count;
			}
			return buffer;
		}
		finally
		{
			try
			{
				in.close();
			}
			catch (IOException ignored)
			{
				// closing failure is not fatal; the content was already read
			}
		}
	}
	
}

